notebook.community

Edit and run



In [1]:

    
import re



In [6]:

    
# Regex for one table cell of the sample page: "<td>NAME &lt;ADDRESS&gt;</td>".
# Group 1 captures the name (lazy match up to " &lt;"); group 2 captures the
# obfuscated address, which must contain "(at)" followed later by "(dot)".
EMAIL_CELL_PATTERN = r'<td>(.+?) &lt\;(.+\(at\).+\(dot\).+)&gt\;</td>'
regex = re.compile(EMAIL_CELL_PATTERN)



In [11]:

    
# Open the sample HTML page for reading; the extraction cell iterates over
# this handle line by line.
SAMPLE_HTML_PATH = 'email_sample.html'
file = open(SAMPLE_HTML_PATH, 'r')



In [12]:

    
# Collect one (name, obfuscated address) pair per line that matches `regex`.
email = []
with file:  # close the handle once every line has been scanned
    for line in file:
        m = regex.search(line)
        if m is None:
            # Line holds no "<td>NAME &lt;ADDRESS&gt;</td>" cell; skip it
            # explicitly instead of swallowing every exception.
            continue
        email.append((m.group(1), m.group(2)))

# Single-argument, parenthesised print gives the same output on Python 2 and 3.
print('Total Email Extracted:  %d' % len(email))
if email:
    # Guard the sample so an input with no matches does not raise IndexError.
    print('sample Email : %s' % (email[0],))









    



Total Email Extracted:  88262
sample Email : ('Siddhartha Roy', 'siddhartha.r85(at)gmail(dot)com')



In [8]:

    
# Inspect the second extracted (name, obfuscated address) pair.
email[1]









    Out[8]:





('Akash Rathi', 'akrathi9945(at)gmail(dot)com')

Replacing the "(dot)" and "(at)" placeholders with the real "." and "@" characters



In [17]:

    
# Smoke test: de-obfuscate a single address before processing the whole list.
# The parentheses are escaped so the patterns match the literal tokens.
replaceDot = re.compile(r'\(dot\)')
replaceAt = re.compile(r'\(at\)')

# First turn "(dot)" into ".", then "(at)" into "@".
temp = replaceDot.sub('.', email[0][1])
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(replaceAt.sub('@', temp))









    



siddhartha.r85@gmail.com



In [21]:

    
# Patterns for the two obfuscation tokens; parentheses are escaped so they
# match literally rather than forming regex groups.
replaceDot = re.compile(r'\(dot\)')
replaceAt = re.compile(r'\(at\)')


def deobfuscate(address):
    """Return `address` with every "(dot)" replaced by "." and every "(at)" by "@"."""
    return replaceAt.sub('@', replaceDot.sub('.', address))


# Build the cleaned list in one pass: names are kept, addresses are de-obfuscated.
PureEmail_data = [(name, deobfuscate(address)) for name, address in email]

if PureEmail_data:
    # Guard the sample so an empty `email` list does not raise IndexError.
    # Single-argument print gives the same output on Python 2 and 3.
    print('sample result : %s' % (PureEmail_data[0],))









    



sample result : ('Siddhartha Roy', 'siddhartha.r85@gmail.com')



In [23]:

    
# Report how many cleaned (name, address) pairs were produced.
# Single-argument print gives the same output on Python 2 and 3.
print('Total Harvested Email : %d' % len(PureEmail_data))
# Drop the obfuscated intermediate list; cells that read `email` must not be
# re-run after this point without re-running the extraction cell first.
del email









    



Total Harvested Email : 88262



In [ ]: